library(Matrix)
library(fdasrvf)
library(lme4)
library(pedigreemm)
library(ggplot2)
library(plotly)
setwd("D:/KCL_2023-2027_PhD/Year 1/Genetics/FMEMs-quantitative-genetics/R_code")
TRFUN25PUP4 = read.delim("TRFUN25PUP4.DAT",header = FALSE)
names(TRFUN25PUP4)<-c("id","sire","dam","trait","x")
df <- data.frame(TRFUN25PUP4)
FirstUniqueIdPos <- which(duplicated(df$id) == FALSE)
N = length(FirstUniqueIdPos) # N = 873 subjects
n = length(df$id) # n = 6860 observations
age_list <- split(df$x,df$id)
trait_list <- split(df$trait,df$id)
## Rescale time interval to [0,1]
## x = (x -min(x))/(max(x) - min(x))
age_list_new <- list()
for (i in 1:N){
age_list_new[[i]] = (age_list[[i]]-min(age_list[[i]]))/(max(age_list[[i]])-min(age_list[[i]]))
}
df$x_rescaled <- unsplit(age_list_new,df$id)
Here we smooth growth curves on the logarithmic scale and then take exponential to recover body mass. This imposes positive smoothing. Smoothing parameter \(\lambda_i\) for each curve is selected by GCV and we restrict \(\lambda \le 10^{-4}\).
agefine <- seq(0,1,length=100) # dense time grid
logmass <- matrix(0,100,N) # store smooth log growth curves
pred_mass <- matrix(0,100,N) # store the smoothed mass recovered by taking exp
lam <- rep(0,N) # smoothing parameter used for each log growth curve
for (i in 1:N){
ss_logmass <- smooth.spline(age_list_new[[i]], log(trait_list[[i]]), cv=FALSE,
all.knots=TRUE)
logmass[,i] <- predict(ss_logmass, agefine)$y
pred_mass[,i] <- exp(predict(ss_logmass,agefine)$y)
lam[i] <- ss_logmass$lambda
}
large_lam <- which(lam > 1e-4)
mass_adjusted <- matrix(0,100, length(large_lam))
for ( i in 1: length(large_lam)){
ss_new <- smooth.spline(age_list_new[[large_lam[i]]], log(trait_list[[large_lam[i]]]),
all.knots = TRUE, lambda = 1e-4) # restrict lambda <=1e-4
mass_adjusted[,i] <- exp(predict(ss_new, agefine)$y)
}
colnames(mass_adjusted) <- as.character(large_lam)
## Let reform the smoothed growth curves into a matrix
trait_hat <- pred_mass
for (i in 1: length(large_lam)){
trait_hat[,large_lam[i]] <- mass_adjusted[,i]
}
Align the growth curves using the r package \(\textbf{fdasrvf}\) and extract the mean from the aligned curves.
aligned_mass_process <- time_warping(f=trait_hat, time=agefine)
aligned_mass_curve <- aligned_mass_process$fn
aligned_mean <- aligned_mass_process$fmean
warping_funs <- aligned_mass_process$warping_functions
We run FPCA on the aligned data and the first three principal components will be used as the basis functions for our model.
fpcaobj_mass <- prcomp(x=t(aligned_mass_curve), retx = TRUE, center = TRUE, rank. = 3)
eigen_mass <- fpcaobj_mass$rotation # eigen vectors
## [,1]
## [1,] 2.341877e-16
## [,1]
## [1,] 0
## [,1]
## [1,] -5.20417e-17
We calculate the additive genetic relationship matrix \(\bf{A}\) and fit both fixed and random effects using the same set of functional basis.
pos = df$id[FirstUniqueIdPos] # extract ids for all subjects
sire_id = df$sire[FirstUniqueIdPos] # extract ids for sire
dam_id = df$dam[FirstUniqueIdPos] # extract ids for dam
pede <- editPed(sire = sire_id, dam = dam_id, label = pos)
ped<- with(pede, pedigree(label=label, sire=sire, dam=dam))
A <- getA(ped)[163:1035,163:1035]
Here we compare three different ways to fit data to our genetic model:
We need to interpolate the discrete eigen vectors to eigen functions
and evaluate eigen functions at the original time points. This will be
done using the function convert_to_basisfunctions.
### define the basis functions (eigenfunctions)
phi_list <- list()
# create an empty list which stores eigenfunctions for 873 subjects
# evaluated at the original time points.
for (i in 1:N){
phi <- convert_to_basisfunctions(t = agefine, eigenvecs = eigen_mass[,1:3],
tout = age_list_new[[i]])
phi_list[[i]] <- phi
}
phi <- do.call(rbind,phi_list)
colnames(phi) <- c("phi1", "phi2", "phi3")
### update the dataframe to include basis functions
df1 <- cbind(df,phi)
### use a fixed regression of the same form of random regression
fmmForm1 <- trait ~ df1$phi1 + df1$phi2 + df1$phi3 + (-1 + df1$phi1 + df1$phi2 + df1$phi3 | df1$id) +
(-1 + df1$phi1 + df1$phi2 + df1$phi3 | df1$id)
system.time(
ff1 <- fit_genetic_fmm(fmmForm1, df1, A, eigen_mass)
) # user system elapsed
## 用户 系统 流逝
## 106.60 4.55 195.12
summary(ff1)
## Linear mixed model fit by REML ['lmerMod']
##
## REML criterion at convergence: 60950.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -5.0919 -0.4668 -0.0205 0.4220 5.9236
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## df1.id df1$phi1 20592.53 143.501
## df1$phi2 6157.40 78.469 -0.59
## df1$phi3 723.83 26.904 0.49 -0.99
## df1.id.1 df1$phi1 18316.43 135.338
## df1$phi2 285.54 16.898 -0.66
## df1$phi3 81.57 9.031 0.74 -0.99
## Residual 272.98 16.522
## Number of obs: 6860, groups: df1$id, 873
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -42.764 6.406 -6.676
## df1$phi1 1923.638 50.841 37.836
## df1$phi2 -274.299 42.780 -6.412
## df1$phi3 -38.079 5.636 -6.756
##
## Correlation of Fixed Effects:
## (Intr) df1$p1 df1$p2
## df1$phi1 -0.948
## df1$phi2 0.979 -0.964
## df1$phi3 0.825 -0.700 0.710
Extract the covariance matrices and convert them to covariance functions.
VC1 <- VarCorr(ff1)
CG1 <- VC1[["df1.id"]]
CE1 <- VC1[["df1.id.1"]]
### Convert to genetic covariance function
CG_fun1 <- eigen_mass%*% CG1 %*% t(eigen_mass)
### environmental covariance function
CE_fun1 <- eigen_mass %*% CE1 %*% t(eigen_mass)
### Phenotypic covariance function
P_fun1 <- CG_fun1 + CE_fun1
We interpolate the smoothed data at the original measurement points.
trait_hat_list <- list()
for (i in 1:N){
inter_trait <- convert_to_basisfunctions(t = agefine, eigenvecs = trait_hat[,i],
tout = age_list_new[[i]])
trait_hat_list[[i]] <- inter_trait
}
df2 <- data.frame(id = df$id, trait = unsplit(trait_hat_list, df$id), phi)
fmmForm2 <- trait ~ df2$phi1 + df2$phi2 + df2$phi3 + (-1 + df2$phi1 + df2$phi2 + df2$phi3 | df2$id) +
(-1 + df2$phi1 + df2$phi2 + df2$phi3 | df2$id)
system.time(
ff2 <- fit_genetic_fmm(fmmForm2, df2, A, eigen_mass)
) # user system elapsed
## 用户 系统 流逝
## 61.21 1.97 113.68
summary(ff2)
## Linear mixed model fit by REML ['lmerMod']
##
## REML criterion at convergence: 60361.4
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1137 -0.4699 -0.0377 0.4124 6.1927
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## df2.id df2$phi1 20420.4 142.90
## df2$phi2 5902.0 76.82 -0.59
## df2$phi3 743.5 27.27 0.48 -0.99
## df2.id.1 df2$phi1 18791.0 137.08
## df2$phi2 758.5 27.54 -0.47
## df2$phi3 223.2 14.94 0.50 -1.00
## Residual 241.6 15.54
## Number of obs: 6860, groups: df2$id, 873
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) -37.514 6.032 -6.219
## df2$phi1 1882.149 48.164 39.078
## df2$phi2 -240.549 40.355 -5.961
## df2$phi3 -36.529 5.433 -6.723
##
## Correlation of Fixed Effects:
## (Intr) df2$p1 df2$p2
## df2$phi1 -0.943
## df2$phi2 0.977 -0.960
## df2$phi3 0.806 -0.672 0.679
### Reform the aligned data to a dataframe for model fitting
subjectID <- rep(unique(df$id), each=100)
trait_pred <- c(aligned_mass_curve)
basis1 <- rep(eigen_mass[,1], times = 873)
basis2 <- rep(eigen_mass[,2], times = 873)
basis3 <- rep(eigen_mass[,3], times = 873)
new_df <- data.frame(subjectID, trait_pred, basis1, basis2, basis3)
names(new_df) <- c("subjectID", "trait_hat", "basis1", "basis2", "basis3")
fmeFormula <- trait_hat ~ new_df$basis1 + new_df$basis2 + new_df$basis3 +
(-1 + new_df$basis1 + new_df$basis2 + new_df$basis3 | new_df$subjectID) +
(-1 + new_df$basis1 + new_df$basis2 + new_df$basis3 | new_df$subjectID)
system.time(
fit_sameBasis <- fit_genetic_fmm(formula= fmeFormula, data=new_df, A = A, phi = eigen_mass)
)
## 用户 系统 流逝
## 1179.76 68.99 1894.39
# user system elapsed
summary(fit_sameBasis)
## Linear mixed model fit by REML ['lmerMod']
##
## REML criterion at convergence: 292502.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -15.0463 -0.4291 -0.0128 0.4343 20.6870
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## new_df.subjectID new_df$basis1 14698.909 121.239
## new_df$basis2 761.155 27.589 -0.23
## new_df$basis3 76.205 8.730 -0.37 0.40
## new_df.subjectID.1 new_df$basis1 18469.901 135.904
## new_df$basis2 71.553 8.459 0.61
## new_df$basis3 219.271 14.808 0.19 -0.66
## Residual 1.349 1.161
## Number of obs: 87300, groups: new_df$subjectID, 873
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 12.6225 0.1382 91.311
## new_df$basis1 1513.5783 13.8295 109.445
## new_df$basis2 92.0683 3.0843 29.851
## new_df$basis3 7.8831 1.0701 7.366
##
## Correlation of Fixed Effects:
## (Intr) nw_d$1 nw_d$2
## new_df$bss1 -0.075
## new_df$bss2 0.292 -0.207
## new_df$bss3 0.086 -0.272 0.324